library(ggplot2)
library(scales)
library(dplyr)
#load('~/Research/taboo/processed_data/forViz1.RData')
# Inputs for part 1: pre-processed data frames saved upstream as .RData files.
load("~/Research/taboo/processed_data/artDF.RData")
load("~/Research/taboo/processed_data/EDA.RData")

# List what the two load() calls (and any earlier session state) left in the workspace.
ls()

# Project directories; every path keeps its trailing slash because the save
# code below builds file names with paste0().
basePath <- "~/Research/taboo/"
dataPath <- paste0(basePath, "processed_data/")
knitrPath <- paste0(basePath, "knitr_rdata/")
figPath <- paste0(basePath, "figures/")

# Overleaf-synced figure folders for the paper and the poster.
docPath <- "~/Dropbox/Apps/Overleaf/[CSCW] Taboo Subjects/figures/"
posterPath <- "~/Dropbox/Apps/Overleaf/[Poster] Taboo Topics/figures/"

# Figure "boxContribSrc": horizontal boxplots of artDF$avgEditorEditCount for
# each `source` group on a natural-log x axis, with each group's mean overlaid
# as a filled black triangle.
options(scipen = 100, digits = 2)
g <- ggplot(artDF, aes(x = avgEditorEditCount, y = source, fill = source)) +
  scale_x_continuous(trans = "log") +
  scale_y_discrete(labels = c("Comparison", "Taboo")) +
  geom_boxplot(alpha = 0.5, width = 0.75) +
  labs(x = "Editor Experience", y = element_blank()) +
  theme(panel.background = element_blank(), panel.grid = element_blank(),
        legend.position = "none", axis.line = element_line(color = "black"),
        axis.text = element_text(size = 14)) +
  stat_summary(fun = mean, geom = "point", shape = 24, size = 3,
               color = "black", fill = "black")
g  # preview when run interactively
options(scipen = 100, digits = 4)
# NOTE(review): ggplot2 builds axis labels when a plot is printed, not when `g`
# is assigned, so everything rendered below appears to use digits = 4 rather
# than the digits = 2 set above -- confirm that is the intended label format.

g

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source(),
# which would leave empty PDF files.
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "boxContribSrc.pdf"), width = 8, height = 3)
  print(g)
  dev.off()
}

# User-page ("up") length figure, log scale.

# Keep only the rows of revDF.clean whose `anon` value is the string "false".
revDF.clean.acct <- subset(revDF.clean, revDF.clean$anon == "false")
# Flag rows with a non-empty user page. Rows where userpage_text_chars is NA
# do not match the first condition and therefore fall through to TRUE.
# NOTE(review): confirm NA should count as "has a user page".
revDF.clean.acct$hasUserpage <- case_when(revDF.clean.acct$userpage_text_chars == 0 ~ FALSE, TRUE ~ TRUE)

# Drop exact duplicate rows.
revDF.clean.acct <- unique(revDF.clean.acct)


#XXX TODO switch to editorDF when that's properly fixed upstream

# Figure "boxLengthSrc": boxplots of user-page length in characters per
# `source` group, with group means as black triangles. The x aesthetic is the
# character count itself; the TRUE/FALSE hasUserpage flag is not a length and
# cannot be drawn meaningfully on a continuous log axis.
# log(0) is -Inf, so rows with a zero-length page are dropped by ggplot2 (with
# a warning) rather than plotted.
# NOTE(review): the y-axis title is left at its default here, unlike the
# figures that pass y = element_blank() -- confirm which is wanted.
g <- ggplot(revDF.clean.acct, aes(x = userpage_text_chars, y = source, fill = source)) +
  scale_x_continuous(trans = "log") +
  geom_boxplot(alpha = 0.5, width = 0.75) +
  labs(x = "Editor Page Length") +
  scale_y_discrete(labels = c("Comparison", "Taboo")) +
  theme(panel.background = element_blank(), panel.grid = element_blank(),
        legend.position = "none", axis.line = element_line(color = "black"),
        axis.text = element_text(size = 14)) +
  stat_summary(fun = mean, geom = "point", shape = 24, size = 3,
               color = "black", fill = "black")
g  # preview when run interactively
#### NA instead of 0? Looks good for now!

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source().
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "boxLengthSrc.pdf"), width = 8, height = 3)
  print(g)
  dev.off()
}

###note that none of these have bots in them

# Candidate x-axis breaks; currently only referenced by the disabled scale
# option noted inside the plot below.
someUsefulBreaks <- c(1, 1000, 1000000, 2000000, 3000000)

# Figure "boxContribSrcNoBot": boxplots of artDF$avgEditorEditCount per
# `source` group on a linear x axis, with group means as black triangles.
# NOTE(review): this reads the same artDF column as the log-scale
# "boxContribSrc" figure; the "(No Bots)" label presumably reflects how artDF
# was built upstream -- confirm.
g <- ggplot(artDF, aes(x = avgEditorEditCount, y = source, fill = source)) +
  # alternative: scale_x_continuous(breaks = someUsefulBreaks)
  scale_x_continuous() +
  geom_boxplot(alpha = 0.2, width = 0.75) +
  labs(x = "Editor Experience (No Bots)", y = element_blank()) +
  scale_y_discrete(labels = c("Comparison", "Taboo")) +
  theme(panel.background = element_blank(), panel.grid = element_blank(),
        legend.position = "none", axis.line = element_line(color = "black"),
        axis.text = element_text(size = 14)) +
  stat_summary(fun = mean, geom = "point", shape = 24, size = 3,
               color = "black", fill = "black")
g  # preview when run interactively
options(scipen = 100, digits = 4)

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source(),
# which would leave empty PDF files.
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "boxContribSrcNoBot.pdf"), width = 8, height = 3)
  print(g)
  dev.off()
}
###note that none of these have bots in them

#up length, swap to editorDF when available XXXX to do
# Figure "boxLengthSrcNoBot": boxplots of user-page length in characters per
# `source` group on a linear x axis, with group means as black triangles.
# The x aesthetic is the character count itself; plotting the TRUE/FALSE
# hasUserpage flag would only ever produce the values 1 and 2 after the
# "+ 1" offset, which is not a page length.
# NOTE(review): the y-axis title is left at its default here, unlike the
# figures that pass y = element_blank() -- confirm which is wanted.
g <- ggplot(revDF.clean.acct, aes(x = userpage_text_chars, y = source, fill = source)) +
  scale_x_continuous() +
  geom_boxplot(alpha = 0.5, width = 0.75) +
  labs(x = "Editor Page Length (No Bots)") +
  scale_y_discrete(labels = c("Comparison", "Taboo")) +
  theme(panel.background = element_blank(), panel.grid = element_blank(),
        legend.position = "none", axis.line = element_line(color = "black"),
        axis.text = element_text(size = 14)) +
  stat_summary(fun = mean, geom = "point", shape = 24, size = 3,
               color = "black", fill = "black")
g  # preview when run interactively
#### NA instead of 0?

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source().
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "boxLengthSrcNoBot.pdf"), width = 8, height = 3)
  print(g)
  dev.off()
}


# Figure "damageRate": boxplots of artDF.dmg$pct_dmg per `source` group, with
# group means as black triangles. theme_bw() is applied first and the theme()
# call after it overrides the panel, legend and axis settings.
g <- ggplot(artDF.dmg, aes(y = source, x = pct_dmg, fill = source, group = source)) +
  geom_boxplot(alpha = 0.5, width = 0.75) +
  labs(x = "Proportion of Damaging Contributions", y = element_blank()) +
  theme_bw() +
  scale_y_discrete(labels = c("Comparison", "Taboo")) +
  stat_summary(fun = mean, geom = "point", shape = 24, size = 3,
               color = "black", fill = "black") +
  theme(panel.background = element_blank(), panel.grid = element_blank(),
        legend.position = "none", axis.line = element_line(color = "black"),
        axis.text = element_text(size = 14))
g  # preview when run interactively

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source(),
# which would leave empty PDF files.
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "damageRate.pdf"), width = 8, height = 3)
  print(g)
  dev.off()
}

g

# Figure "revertRate": boxplots of artDF$pct_revert per `source` group, with
# group means as black triangles. theme_bw() is applied first and the theme()
# call after it overrides the panel, legend and axis settings.
# NOTE(review): the y-axis title is left at its default here, unlike the
# figures that pass y = element_blank() -- confirm which is wanted.
g <- ggplot(artDF, aes(y = source, x = pct_revert, fill = source, group = source)) +
  geom_boxplot(alpha = 0.5, width = 0.75) +
  labs(x = "Proportion of Reverted Contributions") +
  theme_bw() +
  scale_y_discrete(labels = c("Comparison", "Taboo")) +
  stat_summary(fun = mean, geom = "point", shape = 24, size = 3,
               color = "black", fill = "black") +
  theme(panel.background = element_blank(), panel.grid = element_blank(),
        legend.position = "none", axis.line = element_line(color = "black"),
        axis.text = element_text(size = 14))
g  # preview when run interactively

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source(),
# which would leave empty PDF files.
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "revertRate.pdf"), width = 8, height = 3)
  print(g)
  dev.off()
}

# Figure "volumeBox": boxplots of artDF$revid per `source` group, with group
# means as black triangles.
# NOTE(review): the axis label implies artDF$revid holds a per-article
# contribution count rather than a revision id -- confirm against the upstream
# aggregation.
g <- ggplot(artDF, aes(y = source, x = revid, fill = source, group = source)) +
  geom_boxplot(alpha = 0.5, width = 0.75) +
  labs(x = "Number of Contributions", y = element_blank()) +
  theme_bw() +
  scale_y_discrete(labels = c("Comparison", "Taboo")) +
  stat_summary(fun = mean, geom = "point", shape = 24, size = 3,
               color = "black", fill = "black") +
  theme(panel.background = element_blank(), panel.grid = element_blank(),
        legend.position = "none", axis.line = element_line(color = "black"),
        axis.text = element_text(size = 14))
g  # preview when run interactively

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source(),
# which would leave empty PDF files.
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "volumeBox.pdf"), width = 8, height = 3)
  print(g)
  dev.off()
}

# Exploratory only (nothing is written to disk): scatter of artDF$pct_revert
# against artDF$revid, coloured by `source`.
g <- ggplot(
  data = artDF,
  mapping = aes(x = revid, y = pct_revert, color = source)
) +
  geom_point() +
  labs(x = "Number of Revisions", y = "Proportion Reverted") +
  scale_color_discrete(name = "Source", labels = c("Comparison", "Taboo"))

g

# Exploratory only: boxplots of artDF$pct_revert, one per `source` colour.
g <- ggplot(
  data = artDF,
  mapping = aes(x = pct_revert, color = source)
) +
  geom_boxplot(width = 0.25) +
  scale_color_discrete(name = "Source", labels = c("Comparison", "Taboo"))
g

# Figure "revRateCount": artDF$pct_revert against artDF$revid on a natural-log
# x axis, per `source` group: translucent points, a geom_smooth() trend line
# with its default settings, and marginal rugs.
g <- ggplot(artDF, aes(group = source, x = revid, y = pct_revert, color = source)) +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  geom_rug(alpha = 0.2) +
  scale_color_discrete(name = "Source", labels = c("Comparison", "Taboo")) +
  scale_y_continuous() +
  theme_bw() +
  # legend.title is blanked here, so the "Source" name set on the colour scale
  # is not shown.
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_x_continuous(trans = "log") +
  labs(x = "Number of Revisions", y = "Proportion Reverted")

g  # preview when run interactively

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source(),
# which would leave empty PDF files.
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "revRateCount.pdf"), width = 8, height = 6)
  print(g)
  dev.off()
}






#################stop here
## clean everything
################
####################### part 2 starts here
library(ggplot2)

# Part 2 input. Per the original note this file contains vqDF and atBirthDF.
# NOTE(review): the plots below also read artDF and qDF -- confirm they are
# provided by this file or are still in the workspace.
load("~/Research/taboo/processed_data/forViz2.RData")

# Project directories, re-declared so part 2 can run from a fresh session;
# every path keeps its trailing slash because file names are built with paste0().
basePath <- "~/Research/taboo/"
dataPath <- paste0(basePath, "processed_data/")
knitrPath <- paste0(basePath, "knitr_rdata/")
figPath <- paste0(basePath, "figures/")

# Overleaf-synced figure folders for the paper and the poster.
docPath <- "~/Dropbox/Apps/Overleaf/[CSCW] Taboo Subjects/figures/"
posterPath <- "~/Dropbox/Apps/Overleaf/[Poster] Taboo Topics/figures/"

# Seconds in a 365.25-day year.
secondsInAYear <- 365.25 * 24 * 60 * 60

# Figure "boxQualitySrc": boxplots of artDF$avg_quality per `source` group,
# with group means as black triangles. theme_bw() is applied first and the
# theme() call after it overrides the panel, legend and axis settings.
g <- ggplot(artDF, aes(x = avg_quality, y = source, fill = source)) +
  geom_boxplot(alpha = 0.5, width = 0.75) +
  labs(x = "Quality", y = element_blank()) +
  scale_y_discrete(labels = c("Comparison", "Taboo")) +
  theme_bw() +
  theme(panel.background = element_blank(), panel.grid = element_blank(),
        legend.position = "none", axis.line = element_line(color = "black"),
        axis.text = element_text(size = 14)) +
  stat_summary(fun = mean, geom = "point", shape = 24, size = 3,
               color = "black", fill = "black")
g  # preview when run interactively

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source(),
# which would leave empty PDF files.
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "boxQualitySrc.pdf"), width = 8, height = 3)
  print(g)
  dev.off()
}
# for each hypothesis test, build a similar faceted plot showing each group

# Derive an age column by dividing artDF$secondsOldLog by the number of
# seconds in a year.
# NOTE(review): the column name suggests secondsOldLog is already
# log-transformed; if so this quotient is not an age in years -- confirm
# against the upstream code before relying on the axis values.
artDF$yearsOld <- artDF$secondsOldLog / secondsInAYear

# Figure "boxAgeQualityYrSrc": avg_quality boxplots against yearsOld, one
# panel per `source`. group = yearsOld makes one box per distinct yearsOld
# value.
# NOTE(review): with a continuous yearsOld that is likely one box per article;
# binning to whole years may have been intended -- confirm.
g <- ggplot(artDF) +
  geom_boxplot(aes(x = yearsOld, y = avg_quality, group = yearsOld), width = 0.75) +
  facet_wrap(~source)
g  # preview when run interactively

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source(),
# which would leave empty PDF files.
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "boxAgeQualityYrSrc.pdf"), width = 8, height = 6)
  print(g)
  dev.off()
}
## quality at birth

# Figure "qualityAtBirth": geom_smooth() trend (default settings) of
# atBirthDF$weighted_sum over atBirthDF$birthday per `source` group, with
# marginal rugs.
g <- ggplot(atBirthDF, aes(x = birthday, y = weighted_sum, color = source)) +
  geom_smooth() +
  geom_rug() +
  theme_bw() +
  scale_color_discrete(name = "Source", labels = c("Comparison", "Taboo")) +
  # legend.title is blanked here, so the "Source" name set on the colour scale
  # is not shown.
  theme(legend.position = "bottom", legend.title = element_blank()) +
  labs(x = "Article Creation Date", y = "Quality When Created")

g  # preview when run interactively

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source(),
# which would leave empty PDF files.
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "qualityAtBirth.pdf"), width = 8, height = 6)
  print(g)
  dev.off()
}
# Number formatting for everything printed from here on; this setting is not
# reset afterwards in the visible part of the script.
options(scipen = 100, digits = 2)

# Figure "ageGrowth": geom_smooth() trend (default settings) of
# qDF$weighted_sum over qDF$monthsOld per `source` group.
# NOTE(review): the part 2 load comment names vqDF and atBirthDF, not qDF --
# confirm qDF is available at this point.
g <- ggplot(qDF, aes(x = monthsOld, y = weighted_sum, color = source)) +
  geom_smooth() +
  theme_bw() +
  # legend.title is blanked here, so the "Source" name set on the colour scale
  # is not shown.
  theme(legend.position = "bottom", legend.title = element_blank()) +
  scale_color_discrete(name = "Source", labels = c("Comparison", "Taboo")) +
  labs(x = "Months Since Article Creation", y = "Quality")

g  # preview when run interactively

# Write the figure to the project, paper and poster folders. print() is
# explicit because a bare `g` draws nothing inside a loop or under source(),
# which would leave empty PDF files.
for (outDir in c(figPath, docPath, posterPath)) {
  pdf(paste0(outDir, "ageGrowth.pdf"), width = 8, height = 6)
  print(g)
  dev.off()
}

